

***SET THE DIRECToRY to Replication_package
clear
global rootpath "D:\Replication_package"


********************************************************************************
//Here I'm first putting working_file_sep2023.dta together
clear
import delimited "$rootpath/raw_data/ICIS-AIR_FACILITIES.csv"
rename registry_id frsnumber
sort pgm_sys_id
quietly by pgm_sys_id:  gen dup = cond(_N==1,0,_n) 
drop if dup>1
drop dup
sort pgm_sys_id
save "$rootpath/processed_data/new_master_file.dta", replace
 
 
clear
use "$rootpath/processed_data/new_master_file.dta"

expand 76

sort pgm_sys_id
quietly by pgm_sys_id:  gen dup = cond(_N==1,0,_n) 
rename dup time
sort pgm_sys_id time
generate historical_compliance_date = .
replace time = time - 1
gen temp_var1 = mod(time,4)
gen temp_var2 = floor(time/4)
replace historical_compliance_date = 101 + temp_var1 + 100*temp_var2
tab historical_compliance_date
sort pgm_sys_id time
save "$rootpath/processed_data/new_master_file.dta", replace


clear
import delimited "$rootpath/raw_data/ICIS-AIR_INFORMAL_ACTIONS.csv"
gen fine=1
split achieved_date, p("/")
destring achieved_date1, gen(HPVMONTH)
destring achieved_date2, gen(HPVDAY)
destring achieved_date3, gen(HPVYEAR)
drop if achieved_date==""
drop if enf_type_code!="NOV"&enf_type_code!="DAWL"
gen historical_compliance_date = (HPVYEAR - 2000)*100 + floor((HPVMONTH-1)/3) + 1

drop if historical_compliance_date==.
drop if historical_compliance_date>2000
drop if historical_compliance_date<100

sort pgm_sys_id historical_compliance_date

gen nov=0
replace nov = 1 if enf_type_code=="NOV"

gen dawl=0
replace dawl = 1 if enf_type_code=="DAWL"

egen tot_nov = sum(nov), by(pgm_sys_id historical_compliance_date)
egen tot_dawl = sum(dawl), by(pgm_sys_id historical_compliance_date)

sort pgm_sys_id historical_compliance_date
quietly by pgm_sys_id historical_compliance_date:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep pgm_sys_id historical_compliance_date tot_nov tot_dawl
save "$rootpath/processed_data/ICIS_QUARTERLY_NOVS.dta", replace


clear
use "$rootpath/processed_data/new_master_file.dta"
merge 1:1 pgm_sys_id historical_compliance_date using "$rootpath/processed_data/ICIS_QUARTERLY_NOVS.dta"
drop if _merge==2
//here we replace the missing tot_nov and tot_dawl values with zero. 
replace tot_nov = 0 if _merge==1
replace tot_dawl=0 if _merge==1
drop _merge
save "$rootpath/processed_data/new_master_file.dta", replace


//Now we work on stack tests.
clear
import delimited "$rootpath/raw_data/ICIS-AIR_STACK_TESTS.csv"

gen tests=1
gen pass=0
replace pass=1 if air_stack_test_status_desc=="Pass"

split actual_end_date, p("/")
destring actual_end_date1, gen(HPVMONTH)
destring actual_end_date2, gen(HPVDAY)
destring actual_end_date3, gen(HPVYEAR)

gen historical_compliance_date=.
replace historical_compliance_date = (HPVYEAR - 2000)*100 + floor((HPVMONTH-1)/3) + 1
drop if historical_compliance_date==.

sort pgm_sys_id historical_compliance_date
egen quarterly_tests=sum(test), by(pgm_sys_id historical_compliance_date)
egen quarterly_passes=sum(pass), by(pgm_sys_id historical_compliance_date)

sort pgm_sys_id historical_compliance_date
quietly by pgm_sys_id historical_compliance_date:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep pgm_sys_id historical_compliance_date quarterly_tests quarterly_passes
save "$rootpath/processed_data/ICIS_QUARTERLY_TESTS.dta", replace


//Now we merge stack tests file with master working file.
clear
use "$rootpath/processed_data/new_master_file.dta"
merge 1:1 pgm_sys_id historical_compliance_date using "$rootpath/processed_data/ICIS_QUARTERLY_TESTS.dta"
drop if _merge==2
replace quarterly_tests=0 if _merge==1
replace quarterly_passes=0 if _merge==1
drop _merge
save "$rootpath/processed_data/new_master_file.dta", replace


clear
import delimited "$rootpath/raw_data/ICIS-AIR_FORMAL_ACTIONS.csv"
gen fine=1
split settlement_entered_date, p("/")
destring settlement_entered_date1, gen(HPVMONTH)
destring settlement_entered_date2, gen(HPVDAY)
destring settlement_entered_date3, gen(HPVYEAR)

gen historical_compliance_date = (HPVYEAR - 2000)*100 + floor((HPVMONTH-1)/3) + 1
drop if historical_compliance_date==.

sort pgm_sys_id historical_compliance_date
egen quarterly_penalties=sum(penalty_amount), by(pgm_sys_id historical_compliance_date)
egen fines=sum(fine), by(pgm_sys_id historical_compliance_date)

sort pgm_sys_id historical_compliance_date
quietly by pgm_sys_id historical_compliance_date:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep pgm_sys_id historical_compliance_date quarterly_penalties fines
save "$rootpath/processed_data/ICIS_QUARTERLY_FINES.dta", replace


clear
import delimited "$rootpath/raw_data/ICIS-AIR_FORMAL_ACTIONS.csv"

keep if state_epa_flag=="L"
gen fine=1
split settlement_entered_date, p("/")
destring settlement_entered_date1, gen(HPVMONTH)
destring settlement_entered_date2, gen(HPVDAY)
destring settlement_entered_date3, gen(HPVYEAR)

gen historical_compliance_date = (HPVYEAR - 2000)*100 + floor((HPVMONTH-1)/3) + 1
drop if historical_compliance_date==.

sort pgm_sys_id HPVYEAR
egen quarterly_penaltiesnofed=sum(penalty_amount), by(pgm_sys_id HPVYEAR)
egen finesnofed=sum(fine), by(pgm_sys_id HPVYEAR)

sort pgm_sys_id HPVYEAR
quietly by pgm_sys_id HPVYEAR:  gen dup = cond(_N==1,0,_n)
drop if dup>1
rename HPVYEAR year
keep pgm_sys_id year quarterly_penaltiesnofed finesnofed
save "$rootpath/processed_data/ICIS_QUARTERLY_FINES_NOFEDS.dta", replace

clear 
use "$rootpath/processed_data/new_master_file.dta"
merge 1:1 pgm_sys_id historical_compliance_date using "$rootpath/processed_data/ICIS_QUARTERLY_FINES.dta"
drop if _merge==2
replace quarterly_penalties=0 if _merge==1
replace fines=0 if _merge==1
drop _merge
save "$rootpath/processed_data/new_master_file.dta", replace


clear 
import delimited "$rootpath/raw_data/ICIS-AIR_FCES_PCES.csv"
gen full_inspections = 0
gen part_inspections = 0
replace full_inspections=1 if comp_monitor_type_code=="FOO"
replace full_inspections=1 if comp_monitor_type_code=="FFO"

replace part_inspections=1 if comp_monitor_type_code=="PCE"
replace part_inspections=1 if comp_monitor_type_code=="PFF"

split actual_end_date, p("-")
destring actual_end_date1, gen(HPVMONTH)
destring actual_end_date2, gen(HPVDAY)
destring actual_end_date3, gen(HPVYEAR)

gen historical_compliance_date = (HPVYEAR - 2000)*100 + floor((HPVMONTH-1)/3) + 1
drop if historical_compliance_date==.

sort pgm_sys_id historical_compliance_date
egen quarterly_full_inspections=sum(full_inspections), by(pgm_sys_id historical_compliance_date)
egen quarterly_part_inspections=sum(part_inspections), by(pgm_sys_id historical_compliance_date)

sort pgm_sys_id historical_compliance_date
quietly by pgm_sys_id historical_compliance_date:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep pgm_sys_id historical_compliance_date quarterly_full_inspections quarterly_part_inspections
save "$rootpath/processed_data/ICIS_QUARTERLY_INSPECTIONS.dta", replace


clear 
import delimited "$rootpath/raw_data/ICIS-AIR_FCES_PCES.csv"

keep if state_epa_flag=="L"
gen full_inspections = 0
gen part_inspections = 0
replace full_inspections=1 if comp_monitor_type_code=="FOO"
replace full_inspections=1 if comp_monitor_type_code=="FFO"

replace part_inspections=1 if comp_monitor_type_code=="PCE"
replace part_inspections=1 if comp_monitor_type_code=="PFF"

split actual_end_date, p("-")
destring actual_end_date1, gen(HPVMONTH)
destring actual_end_date2, gen(HPVDAY)
destring actual_end_date3, gen(HPVYEAR)

gen historical_compliance_date = (HPVYEAR - 2000)*100 + floor((HPVMONTH-1)/3) + 1
drop if historical_compliance_date==.

sort pgm_sys_id HPVYEAR
egen quarterly_full_inspectionsnofed=sum(full_inspections), by(pgm_sys_id HPVYEAR)
egen quarterly_part_inspectionsnofed=sum(part_inspections), by(pgm_sys_id HPVYEAR)

sort pgm_sys_id HPVYEAR
quietly by pgm_sys_id HPVYEAR:  gen dup = cond(_N==1,0,_n)
drop if dup>1
rename HPVYEAR year
keep pgm_sys_id year quarterly_full_inspectionsnofed quarterly_part_inspectionsnofed
save "$rootpath/processed_data/ICIS_QUARTERLY_INSPECTIONSNOFEDS.dta", replace


clear 
use "$rootpath/processed_data/new_master_file.dta"
merge 1:1 pgm_sys_id historical_compliance_date using "$rootpath/processed_data/ICIS_QUARTERLY_INSPECTIONS.dta"
drop if _merge==2
replace quarterly_full_inspections=0 if _merge==1
replace quarterly_part_inspections=0 if _merge==1
drop _merge
save "$rootpath/processed_data/new_master_file.dta", replace


clear
import delimited "$rootpath/raw_data/ICIS-AIR_VIOLATION_HISTORY.csv" 
drop if enf_response_policy_code!="HPV"
split hpv_dayzero_date, p("-")
split hpv_resolved_date, p("-")


destring hpv_dayzero_date1, gen(monthbegin)
destring hpv_dayzero_date3, gen(yearbegin)
destring hpv_dayzero_date2, gen(daybegin)
//Day Resolved is the date at which the firm was no longer a priority violator. 
destring hpv_resolved_date1, gen(monthend)
destring hpv_resolved_date2, gen(dayend)
destring hpv_resolved_date3, gen(yearend)
drop  if yearend==.&yearbegin==.

replace monthend=12 if yearend==.
replace yearend=2019 if yearend==.
drop if yearend<2001
replace yearend=2019 if yearend>2019
drop if yearbegin>2019

gen time_end = (yearend - 2001)*4 + floor((monthend-1)/3) + 1

replace monthbegin = 1 if yearbegin<2001
replace yearbegin = 2001 if yearbegin<2001

gen time_begin = (yearbegin - 2001)*4 + floor((monthbegin-1)/3) + 1
drop if time_begin==.

gen problem = 0
replace problem=1 if time_begin>time_end
gen temp = time_end
replace time_end = time_begin if problem==1
replace time_begin= temp if problem==1
drop temp

gen HPV=1

keep pgm_sys_id time_begin time_end HPV
gen diff = time_end - time_begin
gen dif2 = diff + 1
gen id=_n
expand dif2

sort id
quietly by id:  gen dup = cond(_N==1,0,_n)
drop if dup>1&diff==0
replace dup=1 if dup==0
gen time = time_begin + dup - 1

sort pgm_sys_id time
egen tot_hpv = sum(HPV), by(pgm_sys_id time)

egen mean_hpv_time = mean(dif2), by(pgm_sys_id time)

keep pgm_sys_id time tot_hpv mean_hpv_time
gen historical_compliance_date = .

gen temp_var1 = mod(time,4)
gen temp_var2 = floor(time/4)

replace historical_compliance_date = 101 + temp_var1 + 100*temp_var2 - 1 

replace historical_compliance_date=1904 if historical_compliance_date==2000
replace historical_compliance_date=1804 if historical_compliance_date==1900
replace historical_compliance_date=1704 if historical_compliance_date==1800
replace historical_compliance_date=1604 if historical_compliance_date==1700
replace historical_compliance_date=1504 if historical_compliance_date==1600
replace historical_compliance_date=1404 if historical_compliance_date==1500
replace historical_compliance_date=1304 if historical_compliance_date==1400
replace historical_compliance_date=1204 if historical_compliance_date==1300
replace historical_compliance_date=1104 if historical_compliance_date==1200
replace historical_compliance_date=1004 if historical_compliance_date==1100
replace historical_compliance_date=904 if historical_compliance_date==1000
replace historical_compliance_date=804 if historical_compliance_date==900
replace historical_compliance_date=704 if historical_compliance_date==800

replace historical_compliance_date=604 if historical_compliance_date==700
replace historical_compliance_date=504 if historical_compliance_date==600
replace historical_compliance_date=404 if historical_compliance_date==500
replace historical_compliance_date=304 if historical_compliance_date==400
replace historical_compliance_date=204 if historical_compliance_date==300
replace historical_compliance_date=104 if historical_compliance_date==200

drop temp_var1 temp_var2 

sort pgm_sys_id time
quietly by pgm_sys_id time:  gen dup = cond(_N==1,0,_n)
drop if dup>1 
drop dup

keep pgm_sys_id tot_hpv historical_compliance_date mean_hpv_time
save "$rootpath/processed_data/ICIS_QUARTERLY_VIOLATIONS_HPV.dta", replace


clear
import delimited "$rootpath/raw_data/ICIS-AIR_VIOLATION_HISTORY.csv"
drop if enf_response_policy_code!="FRV"
split earliest_frv_determ_date, p("-")
replace hpv_resolved_date=earliest_frv_determ_date if hpv_resolved_date==""
split hpv_resolved_date, p("-")

destring earliest_frv_determ_date1, gen(monthbegin)
destring earliest_frv_determ_date3, gen(yearbegin)
destring earliest_frv_determ_date2, gen(daybegin) 
destring hpv_resolved_date1, gen(monthend)
destring hpv_resolved_date2, gen(dayend)
destring hpv_resolved_date3, gen(yearend)
drop  if yearend==.&yearbegin==.

replace monthend=12 if yearend==.
replace yearend=2019 if yearend==.
drop if yearend<2001
replace yearend=2019 if yearend>2019
drop if yearbegin>2019

gen time_end = (yearend - 2001)*4 + floor((monthend-1)/3) + 1

replace monthbegin = 1 if yearbegin<2001
replace yearbegin = 2001 if yearbegin<2001

gen time_begin = (yearbegin - 2001)*4 + floor((monthbegin-1)/3) + 1
drop if time_begin==.

gen problem = 0
replace problem=1 if time_begin>time_end
gen temp = time_end
replace time_end = time_begin if problem==1
replace time_begin= temp if problem==1
drop temp

gen FRV=1

gen diff = time_end - time_begin 
gen dif2 = diff + 1
gen id=_n
expand dif2

sort id
quietly by id:  gen dup = cond(_N==1,0,_n)
drop if dup>1&diff==0
replace dup=1 if dup==0
gen time = time_begin + dup - 1
drop dup

sort pgm_sys_id time
egen tot_frv = sum(FRV), by(pgm_sys_id time)


gen historical_compliance_date = .

gen temp_var1 = mod(time,4)
gen temp_var2 = floor(time/4)

replace historical_compliance_date = 101 + temp_var1 + 100*temp_var2 - 1

replace historical_compliance_date=1904 if historical_compliance_date==2000
replace historical_compliance_date=1804 if historical_compliance_date==1900
replace historical_compliance_date=1704 if historical_compliance_date==1800
replace historical_compliance_date=1604 if historical_compliance_date==1700
replace historical_compliance_date=1504 if historical_compliance_date==1600
replace historical_compliance_date=1404 if historical_compliance_date==1500
replace historical_compliance_date=1304 if historical_compliance_date==1400
replace historical_compliance_date=1204 if historical_compliance_date==1300
replace historical_compliance_date=1104 if historical_compliance_date==1200
replace historical_compliance_date=1004 if historical_compliance_date==1100
replace historical_compliance_date=904 if historical_compliance_date==1000
replace historical_compliance_date=804 if historical_compliance_date==900
replace historical_compliance_date=704 if historical_compliance_date==800

replace historical_compliance_date=604 if historical_compliance_date==700
replace historical_compliance_date=504 if historical_compliance_date==600
replace historical_compliance_date=404 if historical_compliance_date==500
replace historical_compliance_date=304 if historical_compliance_date==400
replace historical_compliance_date=204 if historical_compliance_date==300
replace historical_compliance_date=104 if historical_compliance_date==200

drop temp_var1 temp_var2 

sort pgm_sys_id time
quietly by pgm_sys_id time:  gen dup = cond(_N==1,0,_n)
drop if dup>1 
drop dup

keep pgm_sys_id tot_frv historical_compliance_date
save "$rootpath/processed_data/ICIS_QUARTERLY_VIOLATIONS_FRV.dta", replace


//Merge HPV and FRV violations
clear 
use "$rootpath/processed_data/ICIS_QUARTERLY_VIOLATIONS_HPV.dta"

merge 1:1 pgm_sys_id  historical_compliance_date using "$rootpath/processed_data/ICIS_QUARTERLY_VIOLATIONS_FRV.dta"

replace tot_frv = 0 if tot_frv==.
replace tot_hpv = 0 if tot_hpv==.

sort pgm_sys_id historical_compliance_date
quietly by pgm_sys_id historical_compliance_date:  gen dup = cond(_N==1,0,_n)
drop if dup>1
drop dup
drop _merge
save "$rootpath/processed_data/ICIS_QUARTERLY_TOTAL_VIOLATIONS.dta", replace


//Merge Violations with the main working file
clear
use "$rootpath/processed_data/new_master_file.dta"
sort pgm_sys_id historical_compliance_date
merge 1:1 pgm_sys_id historical_compliance_date using "$rootpath/processed_data/ICIS_QUARTERLY_TOTAL_VIOLATIONS.dta"
drop _merge
replace tot_frv = 0 if tot_frv==.
replace tot_hpv = 0 if tot_hpv==.

drop if frsnumber==.
sort frsnumber historical_compliance_date
quietly by frsnumber historical_compliance_date:  gen dup = cond(_N==1,0,_n)

foreach var of varlist  tot_nov tot_dawl quarterly_tests quarterly_passes quarterly_penalties fines quarterly_full_inspections quarterly_part_inspections tot_hpv tot_frv {
egen `var'2 = sum(`var'), by(frsnumber historical_compliance_date)
}

drop tot_nov - tot_frv
drop if dup>1
drop dup

sort state, stable
tabulate state, gen(s)


sort frsnumber historical_compliance_date
gen open = 0
replace open = 1 if air_operating_status_code=="OPR"
gen active = 0
replace active = 1 if tot_nov2!=0|tot_dawl2!=0|quarterly_tests2!=0|quarterly_passes2!=0|quarterly_penalties2!=0|fines2!=0|quarterly_full_inspections2!=0|quarterly_part_inspections2!=0|tot_hpv2!=0|tot_frv2!=0

sort frsnumber historical_compliance_date active
gen incidence_date=historical_compliance_date if open==0&active==1
egen max_incidence_date = max(incidence_date), by(frsnumber)
replace max_incidence_date=99999 if max_incidence_date==.
gen closing_date = max_incidence_date +500
replace open = 1 if closing_date>1904&closing_date<99998
tab closing_date
//If closing_date is 601 the plant has not been active since then, and we delete all observations after 601
gen drop_plant = 0
replace drop_plant = 1 if historical_compliance_date>closing_date
tab drop_plant if open==1 //yields only drop_plant==0, so we wont delete any operating plants
tab drop_plant if open==0 //yields both o and 1 
drop if drop_plant==1&open==0
drop open-drop_plant
saveold "$rootpath/processed_data/final_plant_master_file.dta", replace version(13)


//Since pollution damage file is in years, we need to transform facilties file
clear 
use "$rootpath/processed_data/final_plant_master_file.dta"

gen year = floor(historical_compliance_date/100)
rename quarterly_full_inspections2 full_inspec
rename quarterly_part_inspections2 part_inspec

foreach x of varlist tot_nov2 tot_dawl2 quarterly_tests2 quarterly_passes2 quarterly_penalties2 fines2 full_inspec part_inspec tot_hpv2 tot_frv2 {

egen annual_`x'  = sum(`x'), by(frsnumber year)

}

replace year = 2000+year
sort frsnumber year
quietly by frsnumber year:  gen dup = cond(_N==1,0,_n)
drop if dup>1
drop dup
save "$rootpath/processed_data/final_annual_plant_file.dta", replace

//Here we look into the pollution records.
//NEI plants 
clear
import delimited "$rootpath/raw_data/POLL_RPT_COMBINED_EMISSIONS.csv"
keep if pgm_sys_acrnm=="EIS"
gen criteria_pollutant=0
replace criteria_pollutant=1 if pollutant_name=="Carbon monoxide"
replace criteria_pollutant=1 if pollutant_name=="Lead"
replace criteria_pollutant=1 if regexm(pollutant_name, "Lead compounds")==1
replace criteria_pollutant=1 if pollutant_name=="Lead and compounds"
replace criteria_pollutant=1 if pollutant_name=="Nitrogen oxides"
replace criteria_pollutant=1 if regexm(pollutant_name, "Primary PM")==1
replace criteria_pollutant=1 if pollutant_name=="Sulfur dioxide"
replace criteria_pollutant=1 if pollutant_name=="Volatile organic compounds"
replace criteria_pollutant=1 if pollutant_name=="Ammonia"

gen other_pollutant=0
replace other_pollutant = 1 if pollutant_name=="Ozone"

drop if other_pollutant==0&criteria_pollutant==0
tab unit_of_measure
replace annual_emission = annual_emission/2204.62 if unit_of_measure=="Pounds"
rename reporting_year year

gen CO_tons = 0
replace CO_tons = annual_emission if pollutant_name=="Carbon monoxide"

gen PM10_tons = 0
replace PM10_tons = annual_emission if regexm(pollutant_name, "Primary PM10")==1

gen PM25_tons = 0
replace PM25_tons = annual_emission if regexm(pollutant_name, "Primary PM10")!=1&regexm(pollutant_name, "Primary PM")==1

gen SO2_tons = 0
replace SO2_tons = annual_emission if pollutant_name=="Sulfur dioxide"

gen NOX_tons = 0
replace NOX_tons = annual_emission if pollutant_name=="Nitrogen oxides"

gen VOC_tons = 0
replace VOC_tons = annual_emission if pollutant_name=="Volatile organic compounds"

gen Lead_tons = 0 
replace Lead_tons = annual_emission if regexm(pollutant_name, "Lead compounds")==1 | pollutant_name == "Lead" | pollutant_name == "Lead and compounds"

gen NH3_tons = 0
replace NH3_tons = annual_emission if pollutant_name=="Ammonia"

sort pgm_sys_id year
egen CO_annual = sum(CO_tons), by(pgm_sys_id year)
egen PM10_annual = sum(PM10_tons), by(pgm_sys_id year)
egen PM25_annual = sum(PM25_tons), by(pgm_sys_id year)
egen SO2_annual = sum(SO2_tons), by(pgm_sys_id year)
egen NOX_annual = sum(NOX_tons), by(pgm_sys_id year)
egen VOC_annual = sum(VOC_tons), by(pgm_sys_id year)
egen PB_annual = sum(Lead_tons), by(pgm_sys_id year)
egen NH3_annual = sum(NH3_tons), by(pgm_sys_id year)
egen Ozone_annual = sum(other_pollutant), by(pgm_sys_id year)

sort pgm_sys_id year
quietly by pgm_sys_id year:  gen dup = cond(_N==1,0,_n)
drop if dup>1 
drop dup

keep registry_id year pgm_sys_id CO_annual - Ozone_annual
saveold "$rootpath/processed_data/NEI_plant_pollution_file_w_NH3.dta", replace version(13)


//From EPA's FRS registry website we obtained FRS_LINK_FILE to cross match EIS to ICIS plants
clear 
import delimited "$rootpath/raw_data/FRS_PROGRAM_LINKS.csv"
sort pgm_sys_acrnm
keep if pgm_sys_acrnm=="EIS"
sort registry_id pgm_sys_id
by registry_id pgm_sys_id: gen dup = cond(_N==1, 0, _n)
drop if dup>1
drop dup
rename pgm_sys_id eis_id
save "$rootpath/processed_data/FRS_EIS_ONLY.dta", replace

clear
import delimited "$rootpath/raw_data/FRS_PROGRAM_LINKS.csv"
sort pgm_sys_acrnm
keep if pgm_sys_acrnm=="AIR"
sort registry_id
by registry_id: gen dup = cond(_N==1, 0, _n)
drop if dup>1
drop dup
merge 1:m registry_id using "$rootpath/processed_data/FRS_EIS_ONLY.dta"
keep pgm_sys_id registry_id eis_id fips_code
sort registry_id eis_id
by registry_id eis_id: gen dup = cond(_N==1, 0, _n)
drop if dup>1
drop dup
rename registry_id frsnumber
drop if pgm_sys_id==""
save "$rootpath/processed_data/FRS_EIS_AIR.dta", replace

clear
import delimited "$rootpath/raw_data/ZIP-COUNTY-FIPS.csv", stringcols(1) 
rename zip zip_code
rename stcountyfp fips
drop classfp
sort zip_code
qui by zip_code: gen dup = cond(_N==1, 0, _n)
tab dup
drop if dup>1
drop dup
save "$rootpath/processed_data/fips.dta", replace

clear 
use "$rootpath/processed_data/fips.dta"
gen fips_2dgt = 0
replace fips_2dgt = floor(fips/1000)
keep state fips_2dgt
sort state fips_2dgt
by state fips_2dgt: gen dup=cond(_N==1, 0, _n)
drop if dup>1 
drop dup
save "$rootpath/processed_data/2_digit_fips.dta", replace

clear
use "$rootpath/processed_data/NEI_plant_pollution_file_w_NH3.dta" 
rename pgm_sys_id eis_id
rename registry_id frsnumber
sort frsnumber eis_id
merge m:1 frsnumber eis_id using "$rootpath/processed_data/FRS_EIS_AIR.dta"
keep if _merge==3 
drop _merge
sort frsnumber pgm_sys_id year
by frsnumber pgm_sys_id year: gen dup = cond(_N==1, 0, _n)
drop if dup>1
drop dup
merge m:1 frsnumber year using "$rootpath/processed_data/final_annual_plant_file.dta"
drop if _merge!=3
drop _merge

//Now we need to obtain fips to merge with MD file
//The following function is to get rid of all non-numeric values from fips_code
//ssc inst egenmore
egen fips1 = sieve(fips_code), keep(numeric) 
destring fips1, replace
merge m:1 state using "$rootpath/processed_data/2_digit_fips.dta"
keep if _merge==3
drop _merge
gen fips = 0
replace fips = fips_2dgt*1000 + fips1
drop if fips==.
save "$rootpath/processed_data/NEI_pollution_damage_w_NH3.dta", replace


//Using MDs file we estimate the pollution damages
clear 
import delimited "$rootpath/raw_data/MD_Panel_2014_AP3.csv"
rename ïfips fips
sort fips
tab category
drop if category!="Low Stack" //The "Low Stack" category we go with
drop count category
save "$rootpath/processed_data/MD_AP3_2014.dta", replace


clear
use "$rootpath/processed_data/NEI_pollution_damage_w_NH3.dta"
merge m:1 fips using "$rootpath/processed_data/MD_AP3_2014.dta"
drop if _merge==2
gen NOX_damage = NOX_annual*nox
gen PM25_damage = PM25_annual*pm25
gen SO2_damage = SO2_annual*so2
gen VOC_damage = VOC_annual*voc
gen NH3_damage = NH3_annual*nh3

sort frsnumber year
egen NOXdamage_nei = sum(NOX_damage), by(frsnumber year)
egen PM25damage_nei = sum(PM25_damage), by(frsnumber year)
egen SO2damage_nei = sum(SO2_damage), by(frsnumber year)
egen VOCdamage_nei = sum(VOC_damage), by(frsnumber year)
egen NH3damage_nei = sum(NH3_damage), by(frsnumber year)

sort frsnumber year
quietly by frsnumber year:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep frsnumber fips year nh3-voc NOXdamage_nei-NH3damage_nei
save "$rootpath/processed_data/NEI_pollution_damage_w_NH3.dta", replace

//TRI plants RPT data
clear
import excel "$rootpath/raw_data/Copy of tri chemical classification into CAAA pollutants for AERPP paper.xls", sheet("AERPP") firstrow
rename CASnumber cascompoundid
sort TRIchemicalname
by TRIchemicalname: gen dup = cond(_N==1, 0, _n)
drop if dup>1
drop dup
replace TRIchemicalname = "STYRENE" if TRIchemicalname=="STYRENE (VINYLBENZENE)"
replace TRIchemicalname = "TOLUENE" if TRIchemicalname=="TOLUENE (METHYL BENZENE)"
replace TRIchemicalname = "XYLENE" if TRIchemicalname=="XYLENE (MIXED ISOMERS)"
replace TRIchemicalname = "CERTAIN GLYCOL ETHERS, A LIMITED SET" if TRIchemicalname=="CERTAIN GLYCOL ETHERS"
save "$rootpath/processed_data/TRI_CHEM_POLL_MAP.dta", replace

//Getting as many as possible HAP/CAP pollutant categories as possible
clear
import excel "$rootpath/raw_data/tri-nei-crosswalk.xlsx", sheet("NEI to TRI Crosswalk") firstrow allstring
drop if Overlap_with_TRI=="No"
sort NEIpollutantcode
by NEIpollutantcode: gen dup = cond(_N==1, 0, _n)
drop if dup>1 
drop dup
sort TRIChemicalName NEIpollutantcode
by TRIChemicalName NEIpollutantcode: gen dup = cond(_N==1, 0, _n)
drop if dup>1 
drop dup
save temp, replace

//from tri to nei
clear
import excel "$rootpath/raw_data/tri-nei-crosswalk.xlsx", sheet("TRI to NEI Crosswalk") firstrow allstring
sort TRIChemicalCodenodash TRIChemicalName
by TRIChemicalCodenodash TRIChemicalName: gen dup = cond(_N==1, 0, _n)
drop if dup>1 
drop dup
merge 1:m TRIChemicalName using temp
keep if NEIPollutantType=="CAP"|NEIPollutantType=="HAP"|NEIPollutantType=="CAP/HAP"
drop if _merge==2
drop _merge
rename TRIChemicalCodenodash cascompoundid
sort cascompoundid
by cascompoundid: gen dup = cond(_N==1, 0, _n)
drop if dup>1 
drop dup
keep cascompoundid TRIChemicalName NEIPollutantCode NEIPollutantCodeDescription NEIPollutantType NEIPollutantCategoryName MultipliertogofromNEItoTRI
save "$rootpath/processed_data/tri_pollutant_class_final.dta", replace 


clear
import delimited "$rootpath/raw_data/POLL_RPT_COMBINED_EMISSIONS.csv"
keep if pgm_sys_acrnm=="TRIS"
replace pollutant_name = usubinstr(pollutant_name, "--", "", .)
replace pollutant_name = usubinstr(pollutant_name, "TRI", "", .)
replace pollutant_name = usubinstr(pollutant_name, "  ", "", .)
gen TRIchemicalname = upper(pollutant_name)
merge m:1 TRIchemicalname using "$rootpath/processed_data/TRI_CHEM_POLL_MAP.dta"
tab TRIchemicalname if _merge==2
drop if _merge==2
drop _merge
merge m:1 cascompoundid using "$rootpath/processed_data/tri_pollutant_class_final.dta"
drop if _merge==2
drop _merge
destring VOC, replace
destring TSP, replace
destring Lead, replace
destring Other, replace
gen criteria_pollutant=0
replace criteria_pollutant=1 if pollutant_name=="Carbon monoxide"
replace criteria_pollutant=1 if pollutant_name=="Lead"
replace criteria_pollutant=1 if regexm(pollutant_name, "Lead compounds")==1
replace criteria_pollutant=1 if pollutant_name=="Lead and compounds"
replace criteria_pollutant=1 if pollutant_name=="Nitrogen oxides"
replace criteria_pollutant=1 if regexm(pollutant_name, "Primary PM")==1
replace criteria_pollutant=1 if pollutant_name=="Sulfur dioxide"
replace criteria_pollutant=1 if pollutant_name=="Volatile organic compounds"
replace criteria_pollutant=1 if Lead==1
replace criteria_pollutant=1 if VOC==1
replace criteria_pollutant=1 if TSP==1

gen other_pollutant=0
replace other_pollutant = 1 if pollutant_name=="Ozone"
replace other_pollutant = 1 if Other==1&other_pollutant==0

drop if other_pollutant==0&criteria_pollutant==0
tab unit_of_measure
replace annual_emission = annual_emission/2204.62 if unit_of_measure=="Pounds"
rename reporting_year year

gen PM_tons = 0
replace PM_tons = annual_emission if TSP==1

gen VOC_tons = 0
replace VOC_tons = annual_emission if pollutant_name=="Volatile organic compounds"
replace VOC_tons = annual_emission if VOC==1&VOC_tons ==0

gen Lead_tons = 0 
replace Lead_tons = annual_emission if regexm(pollutant_name, "Lead compounds")==1 | pollutant_name == "Lead" | pollutant_name == "Lead and compounds"
replace Lead_tons = annual_emission if Lead==1&Lead_tons ==0

gen NH3_tons = 0
replace NH3_tons = annual_emission if pollutant_name=="Ammonia"

gen HAP_tons = 0
replace HAP_tons = annual_emission if HAP==1
replace HAP_tons=annual_emission if NEIPollutantType=="HAP"

gen other_tons = 0
replace other_tons = annual_emission if Other==1

sort pgm_sys_id year
egen PM25_annual = sum(PM_tons), by(pgm_sys_id year)
egen VOC_annual = sum(VOC_tons), by(pgm_sys_id year)
egen PB_annual = sum(Lead_tons), by(pgm_sys_id year)
egen NH3_annual = sum(NH3_tons), by(pgm_sys_id year)
egen HAP_annual = sum(HAP_tons), by(pgm_sys_id year)
egen Other_annual = sum(other_pollutant), by(pgm_sys_id year)
sort pgm_sys_id year
quietly by pgm_sys_id year:  gen dup = cond(_N==1,0,_n)
drop if dup>1 
drop dup

keep registry_id year pgm_sys_id PM25_annual - Other_annual
summarize PM25_annual-Other_annual, detail
saveold "$rootpath/processed_data/TRI_plant_pollution_file_w_NH3.dta", replace version(13)


clear 
import delimited "$rootpath/raw_data/FRS_PROGRAM_LINKS.csv"
sort pgm_sys_acrnm
keep if pgm_sys_acrnm=="TRIS"
sort registry_id pgm_sys_id
by registry_id pgm_sys_id: gen dup = cond(_N==1, 0, _n)
drop if dup>1
drop dup
rename pgm_sys_id tris_id
save "$rootpath/processed_data/FRS_TRIS_ONLY.dta", replace

clear
import delimited "$rootpath/raw_data/FRS_PROGRAM_LINKS.csv"
sort pgm_sys_acrnm
keep if pgm_sys_acrnm=="AIR"
sort registry_id
by registry_id: gen dup = cond(_N==1, 0, _n)
drop if dup>1
drop dup
merge 1:m registry_id using "$rootpath/processed_data/FRS_TRIS_ONLY.dta"
keep pgm_sys_id registry_id tris_id fips_code
sort registry_id tris_id
by registry_id tris_id: gen dup = cond(_N==1, 0, _n)
drop if dup>1
drop dup
rename registry_id frsnumber
drop if pgm_sys_id==""
save "$rootpath/processed_data/FRS_TRIS_AIR.dta", replace

clear
use "$rootpath/processed_data/TRI_plant_pollution_file_w_NH3.dta" //These plants are from RPT file
rename pgm_sys_id tris_id
rename registry_id frsnumber
sort frsnumber tris_id
merge m:1 frsnumber tris_id using "$rootpath/processed_data/FRS_TRIS_AIR.dta"
keep if _merge==3 
drop _merge
sort frsnumber pgm_sys_id year
by frsnumber pgm_sys_id year: gen dup = cond(_N==1, 0, _n)
drop if dup>1
drop dup
merge 1:1 frsnumber year using "$rootpath/processed_data/final_annual_plant_file.dta"
drop if _merge!=3
drop _merge

//Now we need to obtain fips to merge with MD file
//ssc inst egenmore
//The following function is to get rid og all non-numeric values from fips_code
egen fips1 = sieve(fips_code), keep(numeric) 
destring fips1, replace
merge m:1 state using "$rootpath/processed_data/2_digit_fips.dta"
keep if _merge==3
drop _merge
gen fips = 0
replace fips = fips_2dgt*1000 + fips1
merge m:1 fips using "$rootpath/processed_data/MD_AP3_2014.dta"
drop if _merge==2
gen PM25_damage = PM25_annual*pm25
gen VOC_damage = VOC_annual*voc
gen NH3_damage = NH3_annual*nh3
gen PB_damage = PB_annual*907185*1100 
sort frsnumber year
egen PM25damage = sum(PM25_damage), by(frsnumber year)
egen VOCdamage = sum(VOC_damage), by(frsnumber year)
egen NH3damage = sum(NH3_damage), by(frsnumber year)
egen PBdamage = sum(PB_damage), by(frsnumber year)
egen Other_annual2 = sum(Other_annual), by(frsnumber year)
sort frsnumber year
quietly by frsnumber year:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep frsnumber fips year nh3-voc PM25_annual VOC_annual NH3_annual PB_annual PM25damage-PBdamage HAP_annual Other_annual2
save "$rootpath/processed_data/TRI_pollution_damage_w_NH3.dta", replace


clear
import delimited "$rootpath/raw_data/AFS_ACTIONS.csv"
drop if date_achieved<20040000
//drop if date_achieved<20100000
drop if all_violation_type_codes==""
//General criteria: DIS, GC1-2,3,4,5,6,7,8,9, 10, M1a,b,c, M2a,b,c,, M3a,b,c,d,e,f, M4a,b,c,d,e,f 
keep if regexm(all_violation_type_codes, "G")==1|regexm(all_violation_type_codes, "M")==1

gen transfered_code = 0

replace transfered_code = 1 if regexm(all_violation_type_codes, "M")!=1&regexm(all_violation_type_codes, "GC4")!=1&regexm(all_violation_type_codes, "GC5")!=1&regexm(all_violation_type_codes, "GC6")!=1

save "$rootpath/processed_data/kept_actionsVSEP23.dta", replace


use "$rootpath/processed_data/kept_actionsVSEP23.dta", clear
gen year=floor(date_achieved/10000)
sort afs_id year
egen min_t_code=min(transfered_code), by(afs_id year)

quietly by  afs_id year:  gen dup = cond(_N==1,0,_n)
drop if dup>1
keep afs_id year  min_t_code
save "$rootpath/processed_data/unkept_actionsVSEP23.dta", replace



clear
import delimited "$rootpath/raw_data/AFS_FACILITIES.csv"
keep plant_id state state_number
sort plant_id
save "$rootpath/processed_data/plant_state_mapping.dta", replace

clear
use "$rootpath/processed_data/kept_actionsVSEP23.dta"
merge m:1 plant_id using "$rootpath/processed_data/plant_state_mapping.dta"
drop if _merge!=3
egen avg_old_code_prob = mean(transfered_code), by(state_number)
replace avg_old_code_prob  = 1 - avg_old_code_prob 

sort state_number
quietly by state_number:  gen dup = cond(_N==1,0,_n) 
tab avg_old_code_prob if dup<2

drop if state=="PR"|state=="DC"|state=="VI"
drop if dup>1
sort avg_old_code_prob
gen rank = _n
keep state state_number avg_old_code_prob rank transfered_code
sort state
save "$rootpath/processed_data/state_resource_mappingVSEP23.dta", replace


clear
insheet using "$rootpath/raw_data/AIR_PROGRAM.csv"
keep plant_id afs_id
sort afs_id
sort afs_id plant_id
//Deleting the duplicates. 
quietly by afs_id plant_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1
drop dup
save "$rootpath/processed_data/AFSPROGRAMV22.dta", replace

clear
// Two data sets taken at different times. Keep more recent one if there are multiples ;
insheet using "$rootpath/raw_data/FRS_PROGRAM_LINKSV2.csv"
// Only looking at air pollution ;
keep if pgm_sys_acrnm=="AIRS/AFS"
rename registry_id frsnumber
format %14.0f frsnumber
rename pgm_sys_id afs_id
drop pgm_sys_acrnm

sort afs_id
merge 1:1 afs_id using "$rootpath/processed_data/AFSPROGRAMV22.dta"
drop if _merge!=3
sort plant_id 
quietly by plant_id:  gen dup = cond(_N==1,0,_n) 
drop if dup>1
drop dup
sort frsnumber
quietly by frsnumber:  gen dup = cond(_N==1,0,_n) 
drop if dup>1
drop dup
keep frsnumber afs_id plant_id
save "$rootpath/processed_data/FRS_AFS_LINK.dta", replace


clear
use "$rootpath/processed_data/final_annual_plant_file.dta"
sort state, stable
merge m:1 state using "$rootpath/processed_data/state_resource_mappingVSEP23.dta"

drop if _merge==2
drop _merge
sort frsnumber, stable
merge m:1 frsnumber using "$rootpath/processed_data/FRS_AFS_LINK.dta"
drop _merge
sort afs_id year, stable
merge m:1 afs_id year using "$rootpath/processed_data/unkept_actionsVSEP23.dta"
drop if _merge==2
drop _merge
sort frsnumber year, stable
merge 1:1 frsnumber year using "$rootpath/processed_data/TRI_pollution_damage_w_NH3.dta"
drop if _merge==2
egen max_merge=max(_merge), by(frsnumber)
drop if max_merge==1

gen tot_damage=NH3damage+PM25damage+VOCdamage+PBdamage

gen tot_damage2=PM25damage+VOCdamage+NH3damage
//This is to deal with the 2000 to 2014 CPI conversion
replace tot_damage=tot_damage*1.39
replace tot_damage2=tot_damage2*1.39
gen compliance=1
replace compliance=0 if annual_tot_hpv2>0|annual_tot_frv2>0
drop _merge
sort year, stable

merge m:1 year using "$rootpath/processed_data/Inflation_Data_Mar20.dta"  //constructed by hand using CPI
drop if _merge==2
drop if _merge==1
///Note, since it was in 2014 dollars above with the 1.39, we need to use a base of 
//2014, then multiply by 1.06 to bring up to 2018. 
//replace tot_damage=tot_damage/inflation_2014
replace tot_damage=(1.06*tot_damage)
//replace tot_damage2=tot_damage2/inflation_2014
replace tot_damage2=(1.06*tot_damage2)

gen log_damage2=log(tot_damage2+1)

//This is a May 2022 edit. 
gen tot_crit_emissions=PM25_annual+VOC_annual+NH3_annual
gen log_crit_emissions=log(tot_crit_emissions+1)
gen log_hap_emissions=log(HAP_annual+1)

gen tot_emissions=Other_annual2+HAP_annual+PM25_annual+NH3_annual+VOC_annual+PB_annual
gen log_tot_emissions=log(tot_emissions+1)
drop if state=="PR"|state=="DC"|state=="VI"
gen post=0
replace post=1 if year>2014

gen treat=0
replace treat=avg_old_code_prob

gen treatXpost=treat*post

sort frsnumber year, stable
quietly by frsnumber  year:  gen dup = cond(_N==1,0,_n)
 
drop if state[_n]!=state[_n+1]&frsnumber == frsnumber[_n+1]
//repeat this next line until there are no more changes to the data.
drop if state[_n]!=state[_n+1]&frsnumber== frsnumber[_n+1]
drop if state[_n]!=state[_n+1]&frsnumber== frsnumber[_n+1]
 
tabulate state, gen(s_)

gen state_id=.
foreach x of numlist 1/51 {
replace state_id = `x' if s_`x'==1&s_`x'!=.
}
xtset frsnumber year
gen log_damages=log(tot_damage+1)

sort tot_damage, stable
egen p97 = pctile(tot_damage2), p(97.5)
egen p03 = pctile(tot_damage2), p(2.5)

split naics_codes, p(" ")
destring naics_codes1 , gen(naics) force
gen n4 = floor(naics/100)
tabulate n4, gen(n_4_code)

forvalues d=1/171 {

replace n_4_code`d' = year - 2007 if n_4_code`d'>0&n_4_code`d'!=.
}

xtset frsnumber year

gen treat1=.
replace treat1=1 if rank>42
replace treat1=0 if rank<13
gen treat1Xpost = treat1*post

gen constant=1
egen tot_constant=sum(constant), by(frsnumber)
drop _merge
save "$rootpath/processed_data/working_file_sep2023.dta", replace


********************************************************************************
//SRF_results data transformation
///This is for converting the state names in the SRF later. 
clear
import delimited "$rootpath/raw_data/State_Name_Mapping.csv", varnames(1) encoding (UTF-8) clear 
saveold "$rootpath/processed_data/state_name_matching.dta", version(12) replace

clear
import excel "$rootpath/raw_data/SRF_round1_2.xlsx", sheet("Sheet1") firstrow
foreach var of varlist dataproblem_1 - violationproblem_2 {
	replace `var' = 0 if `var'==.
}

replace dataproblem_1 = 1 if dataproblem_1==1|dataproblem_2==1
replace enforcementproblem_1 = 1 if enforcementproblem_1==1|enforcementproblem_2==1
replace inspectionproblem_1 = 1 if inspectionproblem_1==1|inspectionproblem_2==1
replace penaltyproblem_1 = 1 if penaltyproblem_1==1|penaltyproblem_2==1
replace violationproblem_1 = 1 if violationproblem_1==1|violationproblem_2==1

keep state - violationproblem_1
rename state state_two
sort state_two
saveold "$rootpath/processed_data/SRF_Results.dta", version(12) replace


//adding rounds 3 and 4
clear
import delimited "$rootpath/raw_data/table_export.csv", delimiter(comma) bindquote(strict) varnames(1)
merge m:1 state using  "$rootpath/processed_data/state_name_matching.dta" 
drop if _merge==1
drop _merge
sort state_two
merge m:1 state_two using "$rootpath/processed_data/SRF_Results.dta"
drop _merge

//This is for defining whether the state received a problematic result in this area of review. 
gen problem=0
replace problem=1 if findinglevel!=""

replace problem=0 if media!="CAA"
//drop if reviewyear>2014
replace problem=0 if reviewyear>2014

gen data_problem=0
replace data_problem=1 if problem==1&element=="Data"
egen dataproblem=max(data_problem), by(state)
replace dataproblem = 1 if dataproblem_1==1

gen enforcement_problem=0
replace enforcement_problem=1 if problem==1&element=="Enforcement"
egen enforcementproblem=max(enforcement_problem), by(state)
replace enforcementproblem = 1 if enforcementproblem_1==1

gen inspection_problem=0
replace inspection_problem=1 if problem==1&element=="Inspections"
egen inspectionproblem=max(inspection_problem), by(state)
replace inspectionproblem = 1 if inspectionproblem_1==1

gen penalty_problem=0
replace penalty_problem=1 if problem==1&element=="Penalties"

egen penaltyproblem=max(penalty_problem), by(state)
replace penaltyproblem = 1 if penaltyproblem_1==1

gen violation_problem=0
replace violation_problem=1 if problem==1&element=="Violations"

egen violationproblem=max(violation_problem), by(state)
replace violationproblem = 1 if violationproblem_1==1

sort state_two
quietly by state_two:  gen dup = cond(_N==1,0,_n)
drop if dup>1

replace state=state_two

keep state violationproblem penaltyproblem inspectionproblem enforcementproblem dataproblem
saveold "$rootpath/processed_data/SRF_Results.dta", version(12) replace 

********************************************************************************
//This is to map frsnumber to watch list plants, by year
/*
clear
import excel "$rootpath/raw_data/WL_RA_work.xlsx", sheet("sheet1") firstrow

drop if frsnumber == .
duplicates drop
merge 1:m frsnumber using "$rootpath/processed_data/working_file_sep2023.dta"
drop if _merge!=3
keep frsnumber WL_original_id pgm_sys_id
duplicates drop
save "$rootpath/processed_data/wl_map_AUG23.dta", replace


use "$rootpath/processed_data/matched_eight.dta", clear
sort pgm_sys_id
merge 1:m pgm_sys_id using "$rootpath/processed_data/working_file_sep2023.dta"
drop if _merge !=3
keep frsnumber pgm_sys_id WL_original_id
duplicates drop
append using "$rootpath/processed_data/wl_map_AUG23.dta"
duplicates drop

sort frsnumber
quietly by  frsnumber:  gen dup = cond(_N==1,0,_n)
drop if dup>0
drop dup
sort frsnumber
save "$rootpath/processed_data/wl_map_nodups.dta", replace

*/


********************************************************************************
//This is to assemble a mapping from EIS to FRS number
import delimited "$rootpath\raw_data\784044797.CSV", clear 
save "$rootpath\processed_data\eis_id_to_frsnumber1.dta", replace

import delimited "$rootpath\raw_data\784057508.CSV", clear 
append using "$rootpath\processed_data\eis_id_to_frsnumber1.dta"
save "$rootpath\processed_data\eis_id_to_frsnumber1.dta", replace 

import delimited "$rootpath\raw_data\784071048.CSV", clear 
append using "$rootpath\processed_data\eis_id_to_frsnumber1.dta"
save "$rootpath\processed_data\eis_id_to_frsnumber1.dta", replace 

import delimited "$rootpath\raw_data\784083648.CSV", clear 
append using "$rootpath\processed_data\eis_id_to_frsnumber1.dta"
save "$rootpath\processed_data\eis_id_to_frsnumber1.dta", replace 

import delimited "$rootpath\raw_data\784088407.CSV", clear 
append using "$rootpath\processed_data\eis_id_to_frsnumber1.dta"
drop if pgm_sys_acrnm =="ICIS"
rename pgm_sys_id eis_id
rename registry_id frsnumber
save "$rootpath\processed_data\eis_id_to_frsnumber1.dta", replace

********************************************************************************
//This is to add DUNS number to data
import delimited "$rootpath\raw_data\NATIONAL_ORGANIZATION_FILE.CSV"
rename v1 REGISTRY_ID
rename v2 PGM_SYS_ACRNM
rename v3 PGM_SYS_ID
rename v4 INTEREST_TYPE
rename v5 AFFILIATION_TYPE
rename v6 START_DATE
rename v7 END_DATE
rename v8 ORG_NAME
rename v9 ORG_TYPE
rename v10 DUNS_NUMBER
rename v11 DIVISION_NAME
rename v12 PHONE_NUMBER
rename v13 ALTERNATE_PHONE
rename v14 FAX_NUMBER
rename v15 EMAIL_ADDRESS
rename v16 EIN
rename v17 STATE_BUSINESS_ID
rename v18 MAILING_ADDRESS
rename v19 SUPPLEMENTAL_ADDRESS
rename v20 CITY_NAME
rename v21 STATE_CODE
rename v22 STATE_NAME
rename v23 POSTAL_CODE
rename v24 COUNTRY_NAME
drop if _n==1

tab PGM_SYS_ACRNM

///Here I destring the frsnumbers and drop those observations with no FRS number. 
destring REGISTRY_ID, gen(frsnumber) force
drop if frsnumber==.
duplicates drop 
save temp, replace 

gen ORG_NAME2=ORG_NAME if AFFILIATION_TYPE=="PARENT COMPANY"
gen ORG_NAME_indicator=0
replace ORG_NAME_indicator=1 if ORG_NAME2!="" 

gsort frsnumber -ORG_NAME_indicator
replace ORG_NAME2=ORG_NAME2[_n-1] if frsnumber==frsnumber[_n-1]&ORG_NAME2=="" 

egen ORG_NAME3=mode(ORG_NAME2), by(frsnumber)
drop if ORG_NAME3==""
sort ORG_NAME3
quietly by ORG_NAME3: gen dup = cond(_N==1,0,_n)
drop if dup>1
gen ORG_NUM=_n
tostring ORG_NUM, gen(ORG_NUMBER)
keep ORG_NAME3 ORG_NUMBER
rename ORG_NAME3 ORG_NAME
save "$rootpath/processed_data/FRS_NAME_LINK.dta", replace

clear 
use temp
///Here I replace all of the observations where the FRS number has a DUNS_NUMBER at some point that is not observed for that frsnumber elsewhere. That is if plant i has a DUNS_NUMBER at one point I assume plant i has the same DUNS_NUMBER at all times. 
gsort frsnumber -DUNS_NUMBER
replace DUNS_NUMBER="" if DUNS_NUMBER=="000000000"

gen ORG_DUNS = DUNS_NUMBER if AFFILIATION_TYPE=="PARENT COMPANY"
replace ORG_DUNS=ORG_DUNS[_n-1] if frsnumber==frsnumber[_n-1]&ORG_DUNS=="" 
by frsnumber: egen FIRM_DUNS = mode(ORG_DUNS) 
replace FIRM_DUNS = DUNS_NUMBER if FIRM_DUNS==""
browse if FIRM_DUNS==""&DUNS_NUMBER!=""

gsort frsnumber -FIRM_DUNS

sort ORG_NAME
recast str150 ORG_NAME, force
sort ORG_NAME
merge m:1 ORG_NAME using "$rootpath/processed_data/FRS_NAME_LINK.dta"
replace FIRM_DUNS=ORG_NUMBER if FIRM_DUNS==""
gsort frsnumber -FIRM_DUNS
replace FIRM_DUNS=FIRM_DUNS[_n-1] if frsnumber==frsnumber[_n-1]&FIRM_DUNS==""

//Here I drop those observation with no DUNS number and frsnumber==.
drop if FIRM_DUNS == ""
keep if frsnumber!=.

//Now I delete duplicates since I now have a 1-1 mapping between frsnumbers and DUNS_NUMBERS
sort frsnumber
quietly by frsnumber:  gen dup = cond(_N==1,0,_n)
drop if dup>1
keep frsnumber FIRM_DUNS 
save "$rootpath/processed_data/FRS_DUNS_LINK.dta", replace

clear
use "$rootpath/processed_data/final_annual_plant_file.dta"
sort frsnumber
quietly by frsnumber:  gen dup = cond(_N==1,0,_n)
drop if dup>1
drop dup
merge 1:1 frsnumber using "$rootpath/processed_data/FRS_DUNS_LINK.dta"
keep if _merge==3
keep frsnumber FIRM_DUNS
duplicates drop 
save "$rootpath/processed_data/DUNS_merged.dta", replace

clear
use "$rootpath/processed_data/final_annual_plant_file.dta"
sort frsnumber
quietly by frsnumber:  gen dup = cond(_N==1,0,_n)
drop if dup>1
drop dup
merge 1:1 frsnumber using "$rootpath/processed_data/DUNS_merged.dta"
rename _merge _merge_frs
rename facility_name ORG_NAME
merge m:1 ORG_NAME using "$rootpath/processed_data/FRS_NAME_LINK.dta"
keep if _merge_frs==3 | _merge_frs==1&_merge==3
replace FIRM_DUNS=ORG_NAME if FIRM_DUNS==""
keep frsnumber ORG_NAME FIRM_DUNS _merge_frs
duplicates drop
save "$rootpath/processed_data/DUNS_merged.dta", replace

clear
use "$rootpath/processed_data/final_annual_plant_file.dta"
sort frsnumber
quietly by frsnumber:  gen dup = cond(_N==1,0,_n)
drop if dup>1
drop dup
merge 1:1 frsnumber using "$rootpath/processed_data/DUNS_merged.dta"
keep if _merge==1
keep frsnumber facility_name pgm_sys_id 
save "$rootpath/processed_data/DUNS_icis_plants_unmatched_one.dta", replace

clear 
use temp
gsort frsnumber -DUNS_NUMBER
replace DUNS_NUMBER="" if DUNS_NUMBER=="000000000"

gen ORG_DUNS = DUNS_NUMBER if AFFILIATION_TYPE=="PARENT COMPANY"
replace ORG_DUNS=ORG_DUNS[_n-1] if frsnumber==frsnumber[_n-1]&ORG_DUNS=="" 
by frsnumber: egen FIRM_DUNS = mode(ORG_DUNS) 
replace FIRM_DUNS = DUNS_NUMBER if FIRM_DUNS==""
browse if FIRM_DUNS==""&DUNS_NUMBER!=""

gsort frsnumber -FIRM_DUNS

sort ORG_NAME
recast str150 ORG_NAME, force
sort ORG_NAME
merge m:1 ORG_NAME using "$rootpath/processed_data/FRS_NAME_LINK.dta"
replace FIRM_DUNS=ORG_NUMBER if FIRM_DUNS==""
gsort frsnumber -FIRM_DUNS
replace FIRM_DUNS=FIRM_DUNS[_n-1] if frsnumber==frsnumber[_n-1]&FIRM_DUNS==""

//Here I drop those observation with no DUNS number and frsnumber==.
drop if FIRM_DUNS == ""
keep if frsnumber!=.
drop _merge

merge m:1 frsnumber using "$rootpath/processed_data/DUNS_merged.dta"
keep if _merge==1
drop _merge
keep frsnumber PGM_SYS_ID AFFILIATION_TYPE ORG_NAME FIRM_DUNS
rename PGM_SYS_ID pgm_sys_id
sort pgm_sys_id
save "$rootpath/processed_data/DUNS_unmatched_plants.dta", replace

//Merging based on pgm_sys_id
clear 
use "$rootpath/processed_data/DUNS_icis_plants_unmatched_one.dta"
sort pgm_sys_id
quietly by pgm_sys_id: gen dup = cond(_N==1, 0, _n)
drop if dup>1 
drop dup
drop if pgm_sys_id==""
merge 1:m pgm_sys_id using "$rootpath/processed_data/DUNS_unmatched_plants.dta"
keep if _merge==3
keep frsnumber FIRM_DUNS
append using "$rootpath/processed_data/DUNS_merged.dta"
duplicates drop
save "$rootpath/processed_data/DUNS_merged_APR24.dta", replace

